# IMPORT PHASE 1 ---------------------------------------------------------------

df_raw <- read_csv("data/confidential-data-not-for-publication/raw/GRI Main v2_WIDE.csv") %>%
  filter(!is.na(group_id)) %>%
  mutate(
    submission_date_1 = SubmissionDate %>% str_split(",") %>% map_chr(1) %>% mdy(),
    submission_date_2 = SubmissionDate %>% str_split(",") %>% map_chr(1) %>% dmy()
  ) %>%
  mutate(
    # remove testing entries from before data collection
    submission_date_1 = if_else(submission_date_1 < ymd("2023/03/20"), NA_Date_, submission_date_1),
    submission_date_2 = if_else(submission_date_2 < ymd("2023/03/20"), NA_Date_, submission_date_2)
  ) %>%
  mutate(
    submission_date = coalesce(submission_date_1, submission_date_2)
  ) %>%

  filter_track(!is.na(submission_date)) %>%
  filter(group_id != "04F002") %>%  # didn't do the survey

  # Remove submissions from accompaniments
  filter(
    !(ind_id == "04F007_2" & surveyor_id == -97),
    !(ind_id == "04F008_3" & surveyor_id == -97)
  ) %>%

  relocate(submission_date) %>%
  filter(!str_detect(group_id, "^test\\d$")) %>%
  filter_track(
    !(surveyor_id_other %in% c("Test_Jayachandran", "Test_Tamilselvi"))
  ) %>%
  mutate(
    delivery_incentive = ifelse(formdef_version < 2303291349, NA, delivery_incentive)
  ) 

# CORRECTIONS - PHASE 1 ---------------------------------------------------------------

# Functions for corrections
coalesce_convert <- function(x, y) {
  if (is.numeric(y)) {
    coalesce(as.numeric(x), y)
  } else {
    coalesce(x, y)
  }
}


correct_value <- function(data, var, correction_df = corrections_vals) {
  print(var)
  data %>%
    mutate("{var}_original" := !!sym(var)) %>%
    left_join(
      corrections_vals %>% filter(question_id == var) %>% select(ind_id, corrected_val), by = "ind_id"
    ) %>%
    mutate("{var}_corrected" := corrected_val) %>%
    mutate("{var}" := coalesce_convert(!!sym(paste0(var, "_corrected")), !!sym(var))) %>%
    select(-corrected_val)
}

mask_name_like_cols <- function(data) {
  name_cols <- names(data)[grepl("_name($|_|\\d)", names(data))]
  if (!length(name_cols)) {
    return(data)
  }
  data %>%
    mutate(across(all_of(name_cols), ~ replace(.x, TRUE, NA)))
}


# 1. Correct IDS
corrections_id <- read_excel("data/raw/corrections.xlsx", "ID corrections")

df_correct_ids <- df_raw %>%
  left_join(corrections_id %>% mutate(across(everything(), as.character))) %>%
  mutate(ind_id = coalesce(ind_id_new, ind_id),
         group_id = coalesce(group_id_new, group_id)) %>%
  filter(!is.na(group_id)) %>%
  mutate(
    group_label = ifelse(group == 1, "discuss", "no_discuss"),
  )



# 2. Correct VALS
corrections_vals <- read_excel("data/raw/corrections.xlsx", "value corrections")

questions_to_correct <- corrections_vals %>% filter(resolved == "yes") %>%
  pull(question_id) %>% unique



df_corrected <- df_correct_ids

for (i in seq_along(questions_to_correct)) {
  df_corrected <- correct_value(data = df_corrected, var = questions_to_correct[[i]])
}


# Final dataset with new var definitions
df <- df_corrected %>%
  mutate(
    arm_label = paste0(group_video, "+", group_label)
  ) %>%
  mutate(surveyor_name = ifelse(surveyor_id == -97, surveyor_id_other, surveyor_name)) %>%
  mutate(surveyor_name = paste0(surveyor_id, ": ", surveyor_name)) %>%
  group_by(group_id) %>%
  mutate(
    across(c(sc_colour, sc_colour_check, sc_colour_check_label),
           ~ first_non_na(.x))
  )



# CHECK FOR DUPLICATES

df %>% dups_stop(KEY)
df %>% dups_stop(ind_id)

# df %>% dups_view(ind_id)

# Anonymising
df <- df %>%
tidylog::select(
    -matches("supervisor_label|surveyor.*_label|surveyor_id_other|location|group_name_\\d|group_person_\\d|ind_name|group_predic_name_*|group_person_att_i_\\d|enum_name_for_delivery"),
    -matches("address|phone|instanceName|ind_name|gps|other_hints|sector|\\[")
  ) %>%
  mask_name_like_cols()

df_phase_1 <- df

# IMPORT PHASE 2 ---------------------------------------------------------------

first_date <- ymd("2023/05/27") # to change to 25th (pre-test)

df_raw <- read_csv("data/confidential-data-not-for-publication/raw/GRI Phase 2_WIDE.csv") %>%
  filter(!is.na(group_id)) %>%
  mutate(survey_date = case_when(
    ind_id == "03M210_2" ~ ymd("2023-06-24"),
    KEY == "uuid:ecb96e01-2b97-44e5-b013-f55ce38cf8ee" ~ ymd("2023-05-29"),
    TRUE ~ survey_date
  )) %>%

  mutate(survey_date = ymd(survey_date)) %>%
  mutate(
    survey_date = if_else(ind_id == "02F203_3", ymd("2023-05-30"), survey_date)
  ) %>%
  filter(
    KEY != "uuid:b66d56db-d280-48ca-b261-5eacadd7326a"
  ) %>%
  filter_track(
    survey_date >= first_date
  ) %>%
  count_prop(survey_date) %>%
  relocate(survey_date) %>%
  filter(!str_detect(group_id, "^test\\d$")) %>%

  mutate(survey_completed = as.numeric(survey_status == 0)) %>%

  # Remove erroneous surveys - where surveyors made an error
  filter(
    group_id != "01F216" & group_id != "01F222" & group_id != "03M206"
  ) 


# 1. Correct IDs
df_correct_ids <- df_raw %>%
  left_join(corrections_id %>% mutate(across(everything(), as.character))) %>%
  mutate(ind_id = coalesce(ind_id_new, ind_id),
         group_id = coalesce(group_id_new, group_id)) %>%
  filter(!is.na(group_id)) %>%
  mutate(
    group_label = ifelse(group == 1, "discuss", "no_discuss"),
  )

# 2. Correct VALS
questions_to_correct <- corrections_vals %>% filter(resolved == "yes") %>%
  pull(question_id) %>% unique

df_corrected <- df_correct_ids

for (i in seq_along(questions_to_correct)) {
  df_corrected <- correct_value(data = df_corrected, var = questions_to_correct[[i]])
}


# Final dataset with new var definitions
df <- df_corrected %>%
  mutate(
    arm_label = paste0(group_video, "+", group_label)
  ) %>%
  mutate(surveyor_name = ifelse(surveyor_id == -97, surveyor_id_other, surveyor_name)) %>%
  mutate(surveyor_name = paste0(surveyor_id, ": ", surveyor_id_label)) %>%
  group_by(group_id) %>%
  mutate(
    across(c(sc_colour, matches("sc_colour_check")),
           ~ first_non_na(.x))
  ) %>%
  mutate(arm_label = fct_relevel(arm_label, "control+no_discuss")) %>%
  ungroup

if (is.null(df$discuss_type)) df$discuss_type <- NA_character_

# EXPORT DATASET
df %>% dups_stop(KEY)
df %>% count_prop()
# Anonymising
df <- df %>%
  tidylog::select(
    -matches("supervisor_label|surveyor.*_label|surveyor_id_other|location|group_name_\\d|group_person_\\d|ind_name|group_predic_name_*|group_person_att_i_\\d|enum_name_for_delivery"),
    -matches("address|phone|instanceName|ind_name|gps|other_hints|sector|\\[")
  ) %>%
  mask_name_like_cols()
df_phase_2 <- df
# Extras - rand details, questions, choices ---------------------------------------------------------------

questions_phase_1 <- import_survey_questions("questionnaire/main_questionnaire_v3.xlsx") %>%
  mutate(label = str_replace_all(label, "\n", " "))
choices_phase_1 <- import_survey_choices("questionnaire/main_questionnaire_v3.xlsx")

questions_phase_2 <- import_survey_questions("questionnaire/main_questionnaire_v4.xlsx") %>%
  mutate(label = str_replace_all(label, "\n", " "))
choices_phase_2 <- import_survey_choices("questionnaire/main_questionnaire_v4.xlsx")

id_vars <- c("survey_date", "surveyor_name", "team_id", "group_id", "ind_id", "ind_name")
demo_vars <- c("age", "female", "speaks_english", "reads_english", "hindu", "bachelors", "married", "employed", "landlord", "children", "employer", "hh_size", "hh_food_exp_pc")
control_vars <- c(
  demo_vars %>% unname(),
  "phase_2",
  "b11", "b12", "i1", "i2", "i3", "dq12",
  "normally_receives_delivery",
  "item_diff",
  "r2_reliability_diff",
  "r2_reliability_shown",
  "r2_reliability_benchmark"
)

rand_details_raw <- read_csv("data/raw/randomisation_details.csv")
rand_details_raw_phase2 <- read_csv("data/raw/randomisation_details_phase2.csv")

rand_details <- bind_rows(
  "phase_1" = rand_details_raw %>% select(-photo_gender),
  "phase_2" = rand_details_raw_phase2,
  .id = "phase"
) %>%
  filter(str_detect(round, "r(1|2)|follow_up")) %>%
  group_by(ind_id, round, pair_includes_trans, item_diff, listener, speaker_1, speaker_2, is_listener, is_speaker, quality_diff, quality_included, photo_order_pair_id) %>%
  mutate(pair_includes_trans = as.logical(pair_includes_trans),
         pair_includes_female = any(str_detect(photo_id, "^W"))) %>%
  glimpse %>%
  mutate(comparator_order_in_pair = row_number()) %>%
  filter(comparator) %>%
  select(comparator_order_in_pair, pair_includes_female)

# Combine phase 1 and 2 ---------------------------------------------------------------

df_comb <- bind_rows(
  "phase_1" = df_phase_1 %>% mutate(across(matches("^group_\\d$|^survey_status$|survey_partial_reason|other_present_who|c7|hiring_choice_refusal_reason_6"), as.character)),
  "phase_2" = df_phase_2 %>% mutate(across(c(b8.1_other, c6, survey_refusal_reason, survey_difficulty_sections), as.character)),
  .id = "phase"
) %>%
  mutate(phase_2 = as.numeric(phase == "phase_2"))

controls_to_impute <- c("b3", "b4", "b5", "b6", "b6.1", "b7", "b8", "b13", "b14", "b11", "b12", "i1", "i2", "i3", "dq12")

df <- df_comb %>%

  # REMOVE INCORRECTLY DONE GROUPS - with incorrect choice sheets
  tidylog::filter(
    group_id != c("04M215"), # incorrect choice sheets
    group_id != "02F233"     # respondent was in pilot study
  ) %>%

  ungroup %>%
  tidylog::mutate(
    across(all_of(controls_to_impute),
           list(missing = ~ ifelse(.x < 0, TRUE, FALSE))),
    across(all_of(controls_to_impute),
           ~ ifelse(.x < 0, NA, .x)),
    across(all_of(controls_to_impute),
           ~ ifelse(is.na(.x), median_na(.x), .x)),
  ) %>%

  mutate(
    age = b1,
    speaks_english = b8.1_label %>% str_detect("English"),
    reads_english = b8.2_label %>% str_detect("English"),
    hindu = b3 == 1,
    bachelors = b4 >= 6,
    married = b5 == 1,
    employed = b6 == 3,
    landlord = b6.1 == 1,
    children = b7 > 0,
    employer = b8 == 1,
    hh_size = b13,
    hh_size_above_med = hh_size > median(hh_size, na.rm = TRUE),
    hh_food_exp_pc = b14 / b13,
    hh_food_exp_pc_above_med = hh_food_exp_pc > median(hh_food_exp_pc, na.rm = TRUE),
    normally_receives_delivery = str_split(dq9, " ") %>% map_lgl(~ "1" %in% .x),
  ) %>%

  mutate(female = target_female) %>%
  mutate(across(all_of(demo_vars), as.numeric)) %>%
  mutate(arm_label = fct_relevel(arm_label, "control+no_discuss")) %>%
  mutate(group_label = fct_relevel(group_label, "no_discuss")) %>%
  mutate(delivery_incentive_exp = fct_explicit_na(delivery_incentive)) %>%

  # Add on discuss_type
  mutate(discuss_type = case_when(
    group == 0 ~ "control",
    choosing_only == 1 ~ "choosing_only",
    discussion_pair == 1 ~ "discussion_pair",
    discussion_full == 1 ~ "discussion_full",
    phase == "phase_1" & group == 1 ~ "discussion_full",
  )) %>%

  # POOL RELEVANT GROUPS
  mutate(
    treat_type_r2 = case_when(
      discuss_type == "control"   ~ "control",
      discuss_type == "choosing_only" & announce_after == 1 ~ "choices_control",
      discuss_type == "choosing_only" & announce_before == 1 ~ "choices_observer",
      discuss_type == "discussion_pair" & is_listener == 1 ~ "discussion_listener",
      discuss_type == "discussion_pair" & is_listener == 0 ~ "discussion_pair",
      discuss_type == "discussion_full" ~ "discussion_full"
    ),
    treat_type_r1 = case_when(
      discuss_type == "control"   ~ "control",
      discuss_type == "choosing_only" ~ "public",
      discuss_type == "discussion_pair" & is_listener == 1 ~ NA_character_,
      discuss_type == "discussion_pair" & is_listener == 0 ~ "discussion_pair",
      discuss_type == "discussion_full" ~ "discussion_full"
    )
  ) %>%
  mutate(treat_type_r2 = fct_relevel(factor(treat_type_r2), c("control", "choices_control", "choices_observer", "discussion_listener", "discussion_pair", "discussion_full"))) %>%
  mutate(treat_type_r1 = fct_relevel(factor(treat_type_r1), c("control", "public", "discussion_pair", "discussion_full"))) %>%
  mutate(discussant = treat_type_r2 %in% c("discussion_full", "discussion_pair")) %>%
  mutate(
    treat_type_r1_label = fct_recode(treat_type_r1,
                                     "No discussion (private)" = "control",
                                     "No discussion (public)" = "public",
                                     "2-person discussion (speakers)" = "discussion_pair",
                                     "3-person discussion" = "discussion_full"),
    treat_type_r1_pooled = fct_recode(
      treat_type_r1,
      "No discussion (private)" = "control",
      "No discussion (public)" = "public",
      "Discussion" = "discussion_pair",
      "Discussion" = "discussion_full"
    ),
    treat_type_r2_label = fct_recode(
      treat_type_r2,
      "No discussion (private)" = "control",
      "No discussion (public, non-observer)" = "choices_control",
      "No discussion (public, observer)" = "choices_observer",
      "2-person discussion (speakers)" = "discussion_pair",
      "2-person discussion (listener)" = "discussion_listener",
      "3-person discussion" = "discussion_full"
    ),
    treat_type_r2_pooled = fct_recode(
      treat_type_r2,
      "No discussion (private)" = "control",
      "No discussion (public)" = "choices_control",
      "No discussion (public)" = "choices_observer",
      "Discussion" = "discussion_pair",
      "Discussion" = "discussion_listener",
      "Discussion" = "discussion_full"
    ),
    control = discuss_type == "control",
    public = discuss_type == "choosing_only",
    public_observer = discuss_type == "choosing_only" & announce_before == 1,
    public_non_observer = discuss_type == "choosing_only" & announce_after == 1,
    discussion_pair_speaker = discuss_type == "discussion_pair" & is_speaker == 1,
    discussion_pair_listener = discuss_type == "discussion_pair" & is_listener == 1,
    discussion_pooled = (discuss_type == "discussion_pair" & is_speaker %in% 1) | discuss_type == "discussion_full"
  ) %>%
  mutate(educ = ifelse(b4 == 8, 5.5, b4)) %>%
  mutate(high_educ = b4 > 4) %>%

  mutate(
    others_could_hear_answers = f2.0 %in% 1 | (is.na(f2.0) & f2.1 %in% 1) | (is.na(f2.0) & f2.1_single %in% 1)
  ) %>%

  mutate(
    discuss_type = case_when(
      phase == "phase_1" & group == 0 ~ "control",
      phase == "phase_1" & group == 1 ~ "discussion_full",
      TRUE ~ discuss_type
    )
  ) %>%
  left_join(
    tibble(
      discuss_type = c("control", "discussion_full", "discussion_pair", "choosing_only"),
      discuss_type_label = c("No discussion (private)", "3-person discussion", "2-person discussion", "No discussion (public)")
    )
  ) %>%
  mutate(discussion_full = discuss_type == "discussion_full") %>%
  mutate(discuss_type = fct_relevel(factor(discuss_type), c("control", "choosing_only", "discussion_pair", "discussion_full"))) %>%
  mutate(discuss_type_label = fct_relevel(factor(discuss_type_label), c("No discussion (private)", "No discussion (public)", "2-person discussion", "3-person discussion"))) %>%

  mutate(
    audio_refused = ifelse(group_role == 1 & discuss_type != "choosing_only",
                           audio_consent_lead %in% 0,
                           NA)
  ) %>%
  group_by(group_id) %>%
  fill(audio_refused, .direction = "updown") %>%
  mutate(audio_refused = ifelse(discuss_type %in% c("control", "choosing_only"), FALSE, audio_refused)) %>%

  mutate(
    across(c(discussion_full,
             normally_receives_delivery,
             public, public_observer, public_non_observer, discussion_pair_speaker, discussion_pair_listener),
           as.numeric)
  ) %>%
  mutate(
    video_type_control = as.numeric(video_type == "control"),
    video_type_placebo = as.numeric(video_type == "placebo"),
    video_type_treatment = as.numeric(video_type == "treatment")
  ) %>%
  mutate(discussion_pair = ifelse(is.na(discussion_pair), FALSE, discussion_pair)) %>%
  mutate(
    across(c(public, public_observer, public_non_observer, discussion_pair_speaker, discussion_pair_listener, discussion_full, discussion_pooled, control),
           as.numeric)
  ) %>%
  ungroup

# Import follow up (phase 1) ---------------------------------------------------------------

df_follow_up <- read_csv("data/confidential-data-not-for-publication/raw/GRI Follow Up Delivery v3_WIDE.csv") %>%
  mutate(submission_date = SubmissionDate %>% str_sub(1, 10) %>% dmy) %>%
  relocate(submission_date) %>%

  # DROP PURPLE SCRATCHCARDS
  tidylog::left_join(df %>% select(ind_id, sc_colour), by = "ind_id") %>%
  tidylog::filter(!(sc_colour %in% "purple")) %>%

  select(-group) %>%
  left_join(df %>% select(ind_id, group), by = "ind_id") %>%
  mutate(
    group_label = ifelse(group == 1, "discuss", "no_discuss"),
  ) %>%
  mutate(group_label = fct_relevel(factor(group_label), "no_discuss")) %>%
  mutate(phase = "phase_1") %>%
  mutate(
    discuss_type = ifelse(phase == "phase_1" & group == 1, "discussion_full", "control"),
  ) %>%
  filter(submission_date >= ymd("2023/03/07")) %>%
  mutate(follow_up_attempted = TRUE,
         discussion_full = discuss_type == "discussion_full") %>%
  mutate(stratum_id = paste0(
    group_id %>% str_sub(2, 2),
    "_",
    group_id %>% str_sub(3, 3)
  )) %>%

  mutate(
    ind_id = ind_id %>%
      str_replace_all("01F047_1B", "01F047_alt_1") %>%
      str_replace_all("01F047_2B", "01F047_alt_2") %>%
      str_replace_all("01F047_3B", "01F047_alt_3")
  ) %>%

  # FOR DUPLICATES, Keep the first one
  mutate(date_time = dmy_hms(SubmissionDate)) %>%
  group_by(ind_id) %>%
  arrange(date_time) %>%
  mutate(row_i = row_number()) %>%
  ungroup() %>%
  tidylog::filter(row_i == 1) %>%
  select(-row_i)

# Anonymising
df_follow_up <- df_follow_up %>%
  tidylog::select(
    -matches("supervisor_label|surveyor.*_label|surveyor_id_other|location|group_name_\\d|group_person_\\d|ind_name|group_predic_name_*|group_person_att_i_\\d|enum_name_for_delivery"),
    -matches("address|phone|instanceName|ind_name|gps|other_hints|sector|\\[")
  ) %>%
  mask_name_like_cols()

# Import follow up (phase 2) ---------------------------------------------------------------

df_follow_up_phase_2 <- read_csv("data/confidential-data-not-for-publication/raw/GRI Follow Up Delivery - Phase 2_WIDE.csv") %>%
  tidylog::filter(deviceid != "(web)") %>%
  # FOR DUPLICATES, Keep the first one
  mutate(date_time = dmy_hms(SubmissionDate)) %>%
  group_by(ind_id) %>%
  arrange(date_time) %>%
  mutate(row_i = row_number()) %>%
  ungroup() %>%
  tidylog::filter(row_i == 1) %>%
  select(-row_i)

df_follow_up <- bind_rows(
  df_follow_up %>% mutate(across(c(other_present_who, other_present_notes), as.character)),
  df_follow_up_phase_2 %>% mutate(across(c(phone), as.character))
) %>%
  select(-discuss_type, -phase) %>%
  tidylog::left_join(df %>% select(ind_id, discuss_type, phase, public, public_observer, public_non_observer, discussion_pair_speaker, discussion_pair_listener, discussion_full, discussion_pooled, control),
                     by = "ind_id", suffix = c("", "_extra"))

# Anonymising
df_follow_up_phase_2 <- df_follow_up_phase_2 %>%
  tidylog::select(
   -matches("supervisor_label|surveyor.*_label|surveyor_id_other|location|group_name_\\d|group_person_\\d|ind_name|group_predic_name_*|group_person_att_i_\\d|enum_name_for_delivery"),
    -matches("address|phone|instanceName|ind_name|gps|other_hints|sector|\\[")
  ) %>%
  mask_name_like_cols()

# Import supplementary data collection ---------------------------------------------------------------
supp <- read_csv("data/confidential-data-not-for-publication/raw/Trans Pilot_WIDE.csv") %>%
  filter(
    survey_completed == 1| survey_status == 2
  ) %>%
  # Deal with duplicate from first day
  group_by(pid) %>%
  mutate(row_i = row_number()) %>%
  ungroup %>%
  mutate(
    across(c(id, pid),
           ~ ifelse(.x == "D1001007" & row_i == 2, "D1001007b", .x))
  ) %>%
  select(-row_i) %>%
  dups_report(pid) %>%
  mutate(arm_label = paste0(ifelse(group == 1, "group", "ind"), "-", video_type)) %>%
  mutate(
    arm_label = fct_relevel(arm_label, "ind-control")
  )
# Anonymising
supp <- supp %>%
  tidylog::select(
    -matches("supervisor_label|surveyor.*_label|surveyor_id_other|location|group_name_\\d|group_person_\\d|ind_name|group_predic_name_*|group_person_att_i_\\d|enum_name_for_delivery"),
    -matches("address|phone|instanceName|ind_name|gps|other_hints|sector|\\["),
    -matches("awc|b0")
  ) 

# Exporting
write.csv(supp, "data/cleaned/Trans_recog_wide_anon.csv")
# Anonymising photo rating survey ---------------------------------------------------------------
d_raw <- haven::read_sav("data/confidential-data-not-for-publication/raw/photo_rating_raw.sav")
head(d_raw)
col_d_raw <- names(d_raw)

d_raw <- d_raw %>%
  tidylog::select(
    -matches("IPAddress|RecipientLastName|RecipientFirstName|RecipientEmail")
  )
# Exporting
haven::write_sav(d_raw, "data/photo_rating_survey/photo_rating_raw_anon.sav")

# EXPORT ---------------------------------------------------------------

# EXPORT DATASET
save(df, df_follow_up, rand_details, questions_phase_1, questions_phase_2,
     demo_vars, control_vars,
     choices_phase_1, choices_phase_2,
     file = "data/cleaned/data_clean.RData")

long_names <- df %>% names() %>% .[str_length(.) > 30]

df %>%
  select(-matches("\\[")) %>%
  select(-any_of(long_names)) %>%
  janitor::clean_names() %>%
  rename_with(~ str_replace_all(.x, "\\.", "__")) %>%
  select(matches("gps")) %>%
  haven::write_dta("data/cleaned/data_clean.dta")
